class color:
    """ANSI escape sequences for styling text printed to a terminal or notebook.

    Prefix the text with one of the attributes and append ``END`` afterwards,
    e.g. ``color.BOLD + "title" + color.END``.
    """

    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set by the codes above.
    END = '\033[0m'
from IPython.display import display, Audio
from pathlib import Path
import librosa
from scipy.signal import butter, lfilter
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Args:
        lowcut: Lower cutoff frequency, in the same unit as ``fs``.
        highcut: Upper cutoff frequency, in the same unit as ``fs``.
        fs: Sampling rate of the signal that will be filtered.
        order: Filter order handed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` numerator/denominator coefficient arrays returned by
        ``scipy.signal.butter``.
    """
    # butter() is given cutoffs normalised by the Nyquist frequency (fs / 2).
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a
def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter ``data`` with a Butterworth filter.

    Args:
        data: Signal to filter; passed unchanged to ``scipy.signal.lfilter``.
        lowcut: Lower cutoff frequency, in the same unit as ``fs``.
        highcut: Upper cutoff frequency, in the same unit as ``fs``.
        fs: Sampling rate of ``data``.
        order: Filter order forwarded to ``butter_bandpass``.

    Returns:
        The filtered signal as returned by ``scipy.signal.lfilter``.
    """
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    # NOTE(review): SciPy's documentation recommends the second-order-sections
    # form for filtering because (b, a) coefficients can lose precision as the
    # order grows; consider switching if artifacts show up in the output.
    y = lfilter(b, a, data)
    return y
# Directory containing the audio rendered for this demo (one model run).
# The literal is split only for readability; the resulting path is unchanged.
wav_files = Path(
    '/data/yinjyun/projects/VocalVAE-RNN-update/results_final/'
    'model-new_att-True_techCond-none_bi-True_adv-False_trainVar-False_'
    'refine-True_et-rnn_drnn-True_chunkCE-True_seqCE-True_var--2_None/'
)

# Sub-directories of ``wav_files`` read by the cells below: ``tech_dir`` holds
# the singing-technique conversions, ``sid_dir`` the conversions labelled by
# singer id (f1, m3, ...).
tech_dir, sid_dir = 'demo_paper-tech-3', 'demo_paper-sid-1'
We focus on singing technique conversion in this demo page.
The following are the audio samples of Fig. 2(b) in the paper.
The audio files below are all converted from Mel-spectrograms using Griffin-Lim.
Therefore, the audio labelled "Original Mel-spectrogram" is the upper bound on audio quality for each conversion.
Again, these samples are obtained by inverting their Mel-spectrograms back to audio.
# Playback rate and band-pass cutoffs used for every clip in this cell.
sr = 22050
low = 150
high = 2300

# (caption to print, file inside ``tech_dir``) for each source clip.
clips = [
    ("f1 straight", 'source-f1_scales_straight_o.wav'),
    ("m6 belt", 'source-m6_arpeggios_belt_e.wav'),
    ("f6 breathy", 'source-f6_scales_breathy_a.wav'),
    ("f9 lip trill", 'source-f9_scales_lip_trill_e.wav'),
    ("m1 vibrato", 'source-m1_arpeggios_vibrato_e.wav'),
    ("m3 vocal fry", 'source-m3_arpeggios_vocal_fry_u.wav'),
]

# Announce, load, band-pass and render an audio player for each clip in turn.
for caption, clip_file in clips:
    print(caption)
    x, _ = librosa.core.load(wav_files / tech_dir / clip_file)
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
# Playback rate and band-pass cutoffs used for every clip in this cell.
sr = 22050
low = 150
high = 2300

print(color.BOLD + "m8 lip trill" + color.END)

# (caption to print, file inside ``tech_dir``): the source clip, its
# reconstruction, and the conversion to each other technique.
clips = [
    ("Original Mel-spectrogram", 'source-m8_arpeggios_lip_trill_i.wav'),
    ("Reconstructed Mel-spectrogram", 'lip_trill-m8_arpeggios_lip_trill_i.wav'),
    ("Convert to belt", 'belt-m8_arpeggios_lip_trill_i.wav'),
    ("Convert to breathy", 'breathy-m8_arpeggios_lip_trill_i.wav'),
    ("Convert to vibrato", 'vibrato-m8_arpeggios_lip_trill_i.wav'),
    ("Convert to vocal fry", 'vocal_fry-m8_arpeggios_lip_trill_i.wav'),
]

# Every clip goes through the same load -> band-pass -> player pipeline; the
# waveform is always bound to ``x`` (no one-off ``x2``) to match other cells.
for caption, clip_file in clips:
    print(caption)
    x, _ = librosa.core.load(wav_files / tech_dir / clip_file)
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
# Playback rate and band-pass cutoffs used for every clip in this cell.
sr = 22050
low = 150
high = 2300

print(color.BOLD + "m9 straight" + color.END)

# (caption to print, file inside ``tech_dir``): the source clip, its
# reconstruction, and the conversion to each other technique.
clips = [
    ("Original Mel-spectrogram", 'source-m9_arpeggios_straight_e.wav'),
    ("Reconstructed Mel-spectrogram", 'straight-m9_arpeggios_straight_e.wav'),
    ("Convert to belt", 'belt-m9_arpeggios_straight_e.wav'),
    ("Convert to breathy", 'breathy-m9_arpeggios_straight_e.wav'),
    ("Convert to lip trill", 'lip_trill-m9_arpeggios_straight_e.wav'),
    ("Convert to vibrato", 'vibrato-m9_arpeggios_straight_e.wav'),
    ("Convert to vocal fry", 'vocal_fry-m9_arpeggios_straight_e.wav'),
]

# Announce, load, band-pass and render an audio player for each clip in turn.
for caption, clip_file in clips:
    print(caption)
    x, _ = librosa.core.load(wav_files / tech_dir / clip_file)
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
# Playback rate and band-pass cutoffs used for every clip in this cell.
sr = 22050
low = 150
high = 2300

print(color.BOLD + "f4 breathy" + color.END)
wav_name = 'f4_scales_breathy_o.wav'

# (caption to print, file-name prefix); each file inside ``tech_dir`` is named
# "<prefix>-<wav_name>".
variants = [
    ("Original Mel-spectrogram", 'source'),
    ("Reconstructed Mel-spectrogram", 'breathy'),
    ("Convert to belt", 'belt'),
    ("Convert to lip trill", 'lip_trill'),
    ("Convert to vibrato", 'vibrato'),
    ("Convert to vocal fry", 'vocal_fry'),
]

# Announce, load, band-pass and render an audio player for each variant.
for caption, prefix in variants:
    print(caption)
    x, _ = librosa.core.load(wav_files / tech_dir / (prefix + '-' + wav_name))
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
# Playback rate and band-pass cutoffs used for every clip in this cell.
sr = 22050
low = 150
high = 2300

print(color.BOLD + "m3 belt" + color.END)
wav_name = 'm3_scales_belt_i.wav'

# (caption to print, file-name prefix); each file inside ``sid_dir`` is named
# "<prefix>-<wav_name>". The "Convert to f4" entry previously loaded the
# 'f2' file, so the f2 conversion was played twice and f4 never was.
variants = [
    ("Original Mel-spectrogram", 'source'),
    ("Reconstructed Mel-spectrogram", 'm3'),
    ("Convert to f1", 'f1'),
    ("Convert to f2", 'f2'),
    ("Convert to f4", 'f4'),
    ("Convert to m4", 'm4'),
    ("Convert to m6", 'm6'),
]

# Announce, load, band-pass and render an audio player for each variant.
for caption, prefix in variants:
    print(caption)
    x, _ = librosa.core.load(wav_files / sid_dir / '-'.join([prefix, wav_name]))
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
# Playback rate and band-pass cutoffs used for every clip in this cell.
sr = 22050
low = 150
high = 2300

print(color.BOLD + "f4 breathy" + color.END)
wav_name = 'f4_scales_breathy_o.wav'

# (caption to print, file-name prefix); each file inside ``sid_dir`` is named
# "<prefix>-<wav_name>".
variants = [
    ("Original Mel-spectrogram", 'source'),
    ("Reconstructed Mel-spectrogram", 'f4'),
    ("Convert to f1", 'f1'),
    ("Convert to f2", 'f2'),
    ("Convert to m3", 'm3'),
    ("Convert to m4", 'm4'),
    ("Convert to m6", 'm6'),
]

# Announce, load, band-pass and render an audio player for each variant.
for caption, prefix in variants:
    print(caption)
    x, _ = librosa.core.load(wav_files / sid_dir / (prefix + '-' + wav_name))
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
# Playback rate and band-pass cutoffs used for every clip in this cell.
sr = 22050
low = 150
high = 2300

print(color.BOLD + "f2 straight" + color.END)
wav_name = 'f2_scales_straight_e.wav'

# (caption to print, file-name prefix); each file inside ``sid_dir`` is named
# "<prefix>-<wav_name>". The clip loaded from the 'f4' file was previously
# captioned "Convert to f2", although f2 is the source singer here and is
# already covered by the reconstruction entry.
variants = [
    ("Original Mel-spectrogram", 'source'),
    ("Reconstructed Mel-spectrogram", 'f2'),
    ("Convert to f1", 'f1'),
    ("Convert to f4", 'f4'),
    ("Convert to m3", 'm3'),
    ("Convert to m4", 'm4'),
    ("Convert to m6", 'm6'),
]

# Announce, load, band-pass and render an audio player for each variant.
for caption, prefix in variants:
    print(caption)
    x, _ = librosa.core.load(wav_files / sid_dir / '-'.join([prefix, wav_name]))
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))